Test the model on data from 2015

Load .RData

# Restore previously saved objects into the current environment.
# NOTE(review): presumably `.RData` supplies `model`, `holiday`, `flu_season`
# and the 2014 splits used below, and `trips_2015.RData` supplies
# `trips_per_day_2015` -- confirm against the scripts that saved them.
load(file = ".RData")
load(file = "trips_2015.RData")
# Derive holiday, weekday and flu-season features for the 2015 daily trips
# table, then drop helper columns and rename the trip count to `num_trips`.
trips_per_day_2015 <- trips_per_day_2015 %>%
  left_join(holiday, by = "ymd") %>%
  mutate(
    # A day counts as a holiday when the join found a matching holiday_name.
    is_holiday = !is.na(holiday_name),
    # NOTE(review): weekdays() on a POSIXct value is evaluated in a time zone;
    # the conversion is kept as-is so the labels stay consistent with however
    # the training data were built -- confirm against the training pipeline.
    weekday = weekdays(as.POSIXct(ymd), abbreviate = TRUE),
    # Two-digit month string, used only to derive the flu-season flag.
    month = format(ymd, "%m"),
    is_flu_season = month %in% flu_season
  ) %>%
  select(-holiday_name, -day_num, -month) %>%
  rename(num_trips = trip_num)

# Rescale both temperature columns by a factor of 10.
# NOTE(review): presumably converts tenths of a degree to degrees -- confirm.
trips_per_day_2015$tmin <- trips_per_day_2015$tmin / 10
trips_per_day_2015$tmax <- trips_per_day_2015$tmax / 10
# Score the model on the 2015 data: R^2 (stored and printed), then RMSE.
test_R_square <- rsquare(model, trips_per_day_2015)
print(test_R_square)
## [1] 0.7245831
print(rmse(model, trips_per_day_2015))
## [1] 8036.401
# Attach the model's predictions to the 2015 data (read below as `pred`).
plot_test_data <- add_predictions(trips_per_day_2015, model)

# Interactive time series: actual trips as black points, predictions as a red
# line with red points, plus a smoother over the predicted values.
ggplotly(
  ggplot(plot_test_data, aes(x = ymd, y = pred)) +
    geom_point(aes(y = num_trips)) +
    geom_line(aes(y = pred), color = "red") +
    geom_point(aes(y = pred), color = "red") +
    geom_smooth() +
    labs(
      x = "Date",
      y = "Predicted (in red)/ Actual (in black)",
      title = "Number of trips at different dates"
    )
)
# Predicted vs. actual trips for 2015; the dashed line is geom_abline()'s
# default (intercept 0, slope 1), i.e. where prediction equals actual.
ggplot(plot_test_data, aes(x = pred, y = num_trips)) +
  geom_point() +
  geom_abline(linetype = "dashed") +
  labs(x = "Predicted", y = "Actual")

The number of trips from mid-September through early November was unusually high compared to 2014. This helps explain why the R^2 is only 0.7245831 when the model is evaluated on the 2015 data.

Comparing with 2014 data

# Stack the train, validation and test splits into one table, add the model's
# predictions, and exclude the row(s) dated 2014-04-30.
plot_train_data <- rbind(train_data, validate_data, test_data) %>%
  add_predictions(model) %>%
  filter(ymd != "2014-04-30")

# Interactive time series for the combined splits: actual trips as black
# points, predictions as a red line with red points, plus a smoother over the
# predicted values.
ggplotly(
  ggplot(plot_train_data, aes(x = ymd, y = pred)) +
    geom_point(aes(y = num_trips)) +
    geom_line(aes(y = pred), color = "red") +
    geom_point(aes(y = pred), color = "red") +
    geom_smooth() +
    labs(
      x = "Date",
      y = "Predicted (in red)/ Actual (in black)",
      title = "Number of trips at different dates"
    )
)
# Predicted vs. actual trips for the combined splits; the dashed line is
# geom_abline()'s default (intercept 0, slope 1).
ggplot(plot_train_data, aes(x = pred, y = num_trips)) +
  geom_point() +
  geom_abline(linetype = "dashed") +
  labs(x = "Predicted", y = "Actual")

# Same two metrics on the combined splits, for comparison with the 2015 scores.
print(rsquare(model, plot_train_data))
## [1] 0.9043868
print(rmse(model, plot_train_data))
## [1] 3156.534